In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import xgboost as xgb
import operator
%matplotlib inline
In [5]:
SEED = 2017
NFOLDS = 5
In [6]:
def train_xgb(X, y, param):
    # Hold out a tiny validation slice so xgboost can report mlogloss on unseen rows
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.005, random_state=SEED)
    xg_train = xgb.DMatrix(X_train, label=y_train)
    xg_val = xgb.DMatrix(X_val, label=y_val)
    watchlist = [(xg_train, 'train'), (xg_val, 'eval')]
    return xgb.train(param, xg_train, param['n_rounds'], evals=watchlist)
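Since the watchlist already carries a held-out eval set, a natural variant (a sketch, not part of the original) is to let xgboost stop once validation mlogloss stalls:
def train_xgb_early(X, y, param, stop_rounds=50):
    # Sketch only: same split as train_xgb, but training halts when the
    # validation mlogloss has not improved for `stop_rounds` rounds
    X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.005, random_state=SEED)
    xg_tr = xgb.DMatrix(X_tr, label=y_tr)
    xg_val = xgb.DMatrix(X_val, label=y_val)
    watchlist = [(xg_tr, 'train'), (xg_val, 'eval')]
    return xgb.train(param, xg_tr, param['n_rounds'], evals=watchlist,
                     early_stopping_rounds=stop_rounds)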
Function for making predictions
In [7]:
def predict_xgb(clf, X_test):
    return clf.predict(xgb.DMatrix(X_test))
Parameters for the xgb model
In [8]:
param = {}
param['objective'] = 'multi:softprob'
param['eval_metric'] = 'mlogloss'
param['eta'] = 0.08
param['colsample_bytree'] = 0.8
param['subsample'] = 0.8
param['seed'] = SEED
param['max_depth'] = 6
param['n_rounds'] = 350
param['num_class'] = 3
param['silent'] = 1
In [9]:
def create_feature_map(features):
    # One line per feature: "<index>\t<name>\tq", with 'q' marking a quantitative feature
    with open('xgb.fmap', 'w') as outfile:
        for i, feature in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feature))
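For reference, a throwaway check of the fmap format with hypothetical feature names (the real map is regenerated below before it is read back):
create_feature_map(['example_feat_a', 'example_feat_b'])  # hypothetical names
print(open('xgb.fmap').read())
# 0	example_feat_a	q
# 1	example_feat_b	q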
In [10]:
train = pd.read_json('train.json')
test = pd.read_json('test.json')
n_train = train.shape[0]
In [11]:
target = {'low':2, 'medium':1, 'high':0}
y_train = train['interest_level'].apply(lambda x : target[x])
X_train = train.drop('interest_level', axis=1)
X_test = test
data = pd.concat([X_train, X_test])
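Train rows sit on top of test rows in data, so a positional split can undo the concat after feature engineering; a one-line check (not in the original):
assert data.shape[0] == n_train + test.shape[0]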
In [12]:
listing_id = test['listing_id'].values
Time feature processing
In [13]:
data['created'] = pd.to_datetime(data['created'])
data['month'] = data['created'].dt.month
data['day'] = data['created'].dt.day
data['week'] = data['created'].dt.week  # deprecated in pandas >= 1.1; use .dt.isocalendar().week there
data['dayofweek'] = data['created'].dt.dayofweek
data['dayofyear'] = data['created'].dt.dayofyear
data['quarter'] = data['created'].dt.quarter
data['hour'] = data['created'].dt.hour
data = data.drop(['created'], axis=1)
Categorical features
In [14]:
cat = ['display_address', 'manager_id', 'building_id', 'street_address']
for i in cat:
    data[i] = LabelEncoder().fit_transform(data[i])
Non-categorical features
In [15]:
data['n_photos'] = data['photos'].apply(len)
data['n_features'] = data['features'].apply(len)
data['n_description'] = data['description'].apply(lambda x: len(x.split(' ')))
data['l_description'] = data['description'].apply(len)
In [16]:
data['featurecopy'] = data['features'].apply(lambda x: ' '.join(x))
tvectorizer = TfidfVectorizer(stop_words='english', max_features=200, ngram_range=(1, 1))
data_sparse = tvectorizer.fit_transform(data['featurecopy'])
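An optional peek at what the vectorizer learned; the vocabulary_ attribute is stable across sklearn versions:
print(sorted(tvectorizer.vocabulary_)[:10])  # sanity check only, not part of the pipeline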
In [17]:
not_features = ['description', 'features', 'listing_id', 'photos', 'featurecopy']
is_feature = [i for i in data.columns if i not in not_features]
In [18]:
data = sparse.hstack([data[is_feature], data_sparse]).tocsr()
In [19]:
X_train = data[:n_train]
X_test = data[n_train:]
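NFOLDS is defined at the top but never used; one way to put it to work (a sketch, not the original flow) is xgboost's built-in cross-validation, to see where the mean validation mlogloss bottoms out before committing to n_rounds:
xg_all = xgb.DMatrix(X_train, label=y_train)
cv_hist = xgb.cv(param, xg_all, num_boost_round=param['n_rounds'],
                 nfold=NFOLDS, seed=SEED, early_stopping_rounds=50)
print(cv_hist['test-mlogloss-mean'].min())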
In [21]:
clf = train_xgb(X_train, np.array(y_train.astype(np.int8)), param)
pred = predict_xgb(clf, X_test)
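listing_id was extracted earlier but no submission is assembled; a minimal sketch, assuming the usual listing_id/high/medium/low CSV format (column order follows the target mapping above: high=0, medium=1, low=2; the file name is an assumption):
sub = pd.DataFrame(pred, columns=['high', 'medium', 'low'])  # pred is (n_test, 3)
sub['listing_id'] = listing_id
sub.to_csv('submission.csv', index=False)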
In [22]:
feature_names = is_feature + ['sparse_%d' % i for i in range(data_sparse.shape[1])]
create_feature_map(feature_names)
In [27]:
importance = clf.get_fscore(fmap='xgb.fmap')
importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)
In [28]:
df = pd.DataFrame(importance, columns = ['feature', 'fscore'])
In [39]:
sb.barplot(x='fscore', y='feature', data=df.head(20))
Out[39]: [horizontal barplot of the top-20 features by fscore]